#pip install pandas-profiling
#pip install python-docx
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
from scipy.stats import pearsonr
# Load the two datasets that the rest of the script compares.
original_data = pd.read_csv(
    'Original Dataset_30 Variables_Not to be Shared.csv'
)
synthetic_data = pd.read_csv('CTGAN Generated data.csv')

# Optional pandas-profiling reports, left disabled. Uncomment to build one
# HTML report per dataset plus a side-by-side comparison report.
# original_report = ProfileReport(original_data, title='Original Data')
# original_report.to_file("original_report.html")
# synthetic_data_report = ProfileReport(synthetic_data, title='Synthetic Data')
# synthetic_data_report.to_file("SyntheticData_report.html")
# comparison_report = original_report.compare(synthetic_data_report)
# comparison_report.to_file("original_vs_transformed.html")
## Distribution plots to compare original vs synthetic data visually
def plot_distplot_pairs(df1, df2, data1_name, data2_name, save_path):
    """Plot side-by-side distributions for every column shared by two dataframes.

    For each numeric column present in both ``df1`` and ``df2`` a two-panel
    figure (density histogram with a KDE curve) is drawn, saved as a PNG in
    ``save_path``, added to ``<save_path>/distplots.zip`` and then shown.

    Args:
        df1: First dataframe; drawn in the left panel.
        df2: Second dataframe; drawn in the right panel.
        data1_name: Label for ``df1`` used in the left panel title.
        data2_name: Label for ``df2`` used in the right panel title.
        save_path: Directory that receives the PNG files and the zip archive.
            It is created when it does not exist.
    """
    sns.set_theme(style='whitegrid')

    # Only columns present in both dataframes can be compared.
    common_columns = df1.columns.intersection(df2.columns)

    os.makedirs(save_path, exist_ok=True)
    zip_path = f'{save_path}/distplots.zip'

    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for column in common_columns:
            # A histogram/KDE needs numeric data; skip anything else instead
            # of aborting the whole run on the first text column.
            if not (pd.api.types.is_numeric_dtype(df1[column])
                    and pd.api.types.is_numeric_dtype(df2[column])):
                continue

            fig, axs = plt.subplots(1, 2, figsize=(10, 4))

            # sns.distplot is deprecated; histplot with a density-scaled
            # histogram and a KDE overlay is its documented replacement.
            panels = ((df1, data1_name), (df2, data2_name))
            for ax, (frame, label) in zip(axs, panels):
                sns.histplot(frame[column], kde=True, stat='density',
                             kde_kws={'cut': 3}, ax=ax)
                ax.set_title(f'Distribution of {column} ({label})')

            fig.tight_layout()

            # Path separators inside a column name would otherwise point the
            # PNG at a sub-directory that does not exist.
            safe_name = str(column).replace('/', '_').replace('\\', '_')
            plot_filename = f'{safe_name}.png'
            plot_path = f'{save_path}/{plot_filename}'
            fig.savefig(plot_path)
            zipf.write(plot_path, arcname=plot_filename)

            plt.show()
            # Close this specific figure so open figures do not pile up.
            plt.close(fig)

    print(f"Plots saved as '{save_path}/distplots.zip'")
# Labels for the two datasets and the output directory for the plots.
data1_name, data2_name = "Original Data", "Synthetic Data"
save_path = "plots_directory"

# Draw, save and zip one distribution figure per shared column.
plot_distplot_pairs(
    original_data,
    synthetic_data,
    data1_name=data1_name,
    data2_name=data2_name,
    save_path=save_path,
)
Plots saved as 'plots_directory/distplots.zip'
# Dataframe Summary comparison
def calculate_summary_statistics(data):
    """Return descriptive statistics with one row per described column.

    Args:
        data: Dataframe to summarise.

    Returns:
        A dataframe indexed by column name with the columns ``count``,
        ``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max`` and
        ``median``, in that order.
    """
    summary_stats = data.describe().transpose()
    # describe() summarises only numeric columns by default. numeric_only=True
    # makes median() cover the same columns instead of raising on text
    # columns, which recent pandas versions do.
    summary_stats['median'] = data.median(numeric_only=True)
    return summary_stats[
        ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'median']
    ]
def compare_summary_statistics(original_data, synthetic_data):
    """Place the summary statistics of two dataframes next to each other.

    Args:
        original_data: Dataframe summarised under the 'Original Data' key.
        synthetic_data: Dataframe summarised under the 'Synthetic Data' key.

    Returns:
        The two summary tables concatenated column-wise, with the dataset
        label as the outer column level.
    """
    per_dataset_stats = [
        calculate_summary_statistics(frame)
        for frame in (original_data, synthetic_data)
    ]
    return pd.concat(
        per_dataset_stats,
        axis=1,
        keys=['Original Data', 'Synthetic Data'],
    )
# Build the side-by-side summary table; the bare name on the last line makes
# a notebook cell display it.
summary_comparison_table = compare_summary_statistics(
    original_data=original_data,
    synthetic_data=synthetic_data,
)
summary_comparison_table
| | Original Data | | | | | | | | | Synthetic Data | | | | | | | | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| | count | mean | std | min | 25% | 50% | 75% | max | median | count | mean | std | min | 25% | 50% | 75% | max | median |
| Feed N Plus 2A content | 7882.0 | 47.976056 | 2.037363 | 40.38 | 47.0200 | 48.150 | 49.3400 | 59.89 | 48.150 | 5962.0 | 47.972068 | 2.046191 | 40.38 | 47.0000 | 48.150 | 49.3800 | 59.89 | 48.150 |
| Reactor WAIT | 7882.0 | 982.706760 | 14.361024 | 956.40 | 968.5400 | 980.920 | 993.3200 | 1009.66 | 980.920 | 5962.0 | 982.563648 | 14.284562 | 963.04 | 968.5300 | 980.890 | 991.9275 | 1009.49 | 980.890 |
| H2 to HC | 7882.0 | 3.821718 | 0.372182 | 2.76 | 3.5600 | 3.780 | 4.0300 | 7.79 | 3.780 | 5962.0 | 3.814648 | 0.335320 | 3.10 | 3.5600 | 3.770 | 4.0300 | 5.66 | 3.770 |
| Reactor 1 Inlet Temp | 7882.0 | 975.360089 | 13.693660 | 941.75 | 964.7900 | 971.180 | 984.7675 | 1005.35 | 971.180 | 5962.0 | 975.295392 | 13.532721 | 945.74 | 964.7700 | 970.845 | 984.4475 | 1004.35 | 970.845 |
| Reactor 2 Inlet Temp | 7882.0 | 982.323996 | 15.599825 | 948.76 | 966.4900 | 982.290 | 994.5375 | 1016.96 | 982.290 | 5962.0 | 982.172801 | 15.434145 | 951.40 | 966.4700 | 981.850 | 994.2700 | 1015.79 | 981.850 |
| Reactor 3 Inlet Temp | 7882.0 | 983.784026 | 14.004152 | 958.95 | 970.5525 | 982.060 | 996.0050 | 1013.95 | 982.060 | 5962.0 | 983.606872 | 13.933120 | 960.06 | 970.4725 | 981.610 | 995.4500 | 1009.32 | 981.610 |
| Reactor 4 Inlet Temp | 7882.0 | 986.966524 | 15.722209 | 962.90 | 971.6300 | 986.580 | 996.5875 | 1019.20 | 986.580 | 5962.0 | 986.789906 | 15.674754 | 964.35 | 971.5500 | 986.385 | 996.1625 | 1015.62 | 986.385 |
| Reactor 1 Delta T | 7882.0 | 160.881586 | 7.579552 | 129.35 | 156.0400 | 161.110 | 166.4575 | 178.08 | 161.110 | 5962.0 | 161.030943 | 7.303361 | 134.38 | 156.2325 | 161.160 | 166.4075 | 176.95 | 161.160 |
| Reactor 2 Delta T | 7882.0 | 106.128895 | 6.387439 | 75.37 | 101.5000 | 106.155 | 110.4900 | 132.41 | 106.155 | 5962.0 | 106.133452 | 6.186524 | 86.24 | 101.5600 | 106.120 | 110.4500 | 128.67 | 106.120 |
| Reactor 3 Delta T | 7882.0 | 71.011284 | 4.095590 | 48.10 | 69.4400 | 71.310 | 73.1800 | 85.91 | 71.310 | 5962.0 | 71.095792 | 3.766175 | 52.29 | 69.5300 | 71.350 | 73.1375 | 83.26 | 71.350 |
| Reactor 4 Delta T | 7882.0 | 41.694635 | 4.379323 | 21.24 | 39.5500 | 42.500 | 44.7200 | 58.45 | 42.500 | 5962.0 | 41.762420 | 4.140208 | 22.69 | 39.6700 | 42.580 | 44.6700 | 51.74 | 42.580 |
| Reactor 1 Delta P | 7882.0 | 0.990741 | 0.345861 | 0.16 | 0.6500 | 1.090 | 1.3100 | 1.99 | 1.090 | 5962.0 | 0.992466 | 0.341575 | 0.45 | 0.6500 | 1.100 | 1.3100 | 1.88 | 1.100 |
| Reactor 2 Delta P | 7882.0 | 2.588273 | 0.093082 | 2.28 | 2.5200 | 2.570 | 2.6700 | 2.83 | 2.570 | 5962.0 | 2.587736 | 0.091843 | 2.33 | 2.5200 | 2.570 | 2.6700 | 2.82 | 2.570 |
| Reactor 3 Delta P | 7882.0 | 2.642166 | 0.135004 | 2.03 | 2.5400 | 2.660 | 2.7500 | 2.96 | 2.660 | 5962.0 | 2.640976 | 0.132266 | 2.15 | 2.5300 | 2.650 | 2.7500 | 2.92 | 2.650 |
| Reactor 4 Delta P | 7882.0 | 2.804509 | 0.241722 | 2.08 | 2.6400 | 2.840 | 2.9800 | 3.30 | 2.840 | 5962.0 | 2.805659 | 0.240884 | 2.13 | 2.6400 | 2.840 | 2.9800 | 3.28 | 2.840 |
| Seperator Pressure | 7882.0 | 33.333715 | 1.339706 | 20.93 | 32.4900 | 33.460 | 34.2700 | 37.05 | 33.460 | 5962.0 | 33.360704 | 1.310147 | 28.65 | 32.5100 | 33.490 | 34.3000 | 36.50 | 33.490 |
| Seperator Temperature | 7882.0 | 105.665434 | 4.046135 | 89.24 | 103.2600 | 105.760 | 108.3100 | 120.44 | 105.760 | 5962.0 | 105.697828 | 3.924914 | 92.96 | 103.2900 | 105.805 | 108.3075 | 119.30 | 105.805 |
| Recycle gas purity | 7882.0 | 81.346800 | 9.148243 | 0.86 | 80.8800 | 82.550 | 83.4500 | 86.82 | 82.550 | 5962.0 | 81.232031 | 9.579163 | 0.86 | 80.8800 | 82.550 | 83.4500 | 86.82 | 82.550 |
| Net gas Hydrogen Purity | 7882.0 | 89.323787 | 1.219458 | 86.11 | 88.2500 | 89.580 | 90.4400 | 91.40 | 89.580 | 5962.0 | 89.338293 | 1.220673 | 86.11 | 88.2500 | 89.590 | 90.4400 | 91.40 | 89.590 |
| Coke on Spent Catalyst | 7882.0 | 3.181111 | 0.821481 | 1.82 | 2.5000 | 3.210 | 3.8400 | 4.65 | 3.210 | 5962.0 | 3.165453 | 0.819124 | 1.82 | 2.5000 | 3.150 | 3.8300 | 4.65 | 3.150 |
| Chloride Injection rate | 7882.0 | 2.374787 | 0.343244 | 0.00 | 2.0800 | 2.490 | 2.5800 | 10.02 | 2.490 | 5962.0 | 2.385143 | 0.296423 | 0.84 | 2.1025 | 2.510 | 2.5800 | 3.16 | 2.510 |
| Total Paraffins in feed | 7882.0 | 64.251845 | 60.531753 | 48.92 | 60.8300 | 62.380 | 63.8600 | 2093.11 | 62.380 | 5962.0 | 62.431676 | 2.201652 | 48.92 | 60.8300 | 62.370 | 63.8600 | 69.95 | 62.370 |
| Total Naphthenes in feed | 7882.0 | 26.427558 | 2.739174 | 18.94 | 24.0900 | 26.810 | 28.4800 | 41.15 | 26.810 | 5962.0 | 26.454210 | 2.749912 | 18.94 | 24.1400 | 26.860 | 28.4900 | 41.15 | 26.860 |
| Total Aromatics in feed | 7882.0 | 10.773975 | 1.065247 | 7.19 | 9.9700 | 10.790 | 11.5600 | 13.39 | 10.790 | 5962.0 | 10.758345 | 1.069349 | 7.19 | 9.9600 | 10.770 | 11.5000 | 13.39 | 10.770 |
| Total olefins in Feed | 7882.0 | 0.208891 | 0.226918 | 0.03 | 0.1000 | 0.120 | 0.1900 | 1.56 | 0.120 | 5962.0 | 0.206850 | 0.213717 | 0.05 | 0.1000 | 0.120 | 0.2000 | 1.48 | 0.120 |
| Reactor LHSV | 7882.0 | 1.564491 | 0.085537 | 1.09 | 1.5000 | 1.570 | 1.6200 | 1.74 | 1.570 | 5962.0 | 1.566602 | 0.080772 | 1.17 | 1.5000 | 1.570 | 1.6200 | 1.74 | 1.570 |
| Feed IBP | 7882.0 | 198.726871 | 6.565372 | 186.23 | 194.0600 | 196.850 | 200.3200 | 217.40 | 196.850 | 5962.0 | 198.658737 | 6.455776 | 186.23 | 193.9700 | 196.850 | 200.0900 | 217.40 | 196.850 |
| 50% IBP | 7882.0 | 252.215292 | 2.962630 | 215.23 | 252.2700 | 252.560 | 252.9000 | 258.50 | 252.560 | 5962.0 | 252.245359 | 2.724750 | 215.23 | 252.2700 | 252.560 | 252.8900 | 258.50 | 252.560 |
| WABT | 7882.0 | 937.694057 | 15.687581 | 914.14 | 921.5100 | 936.035 | 947.5275 | 969.60 | 936.035 | 5962.0 | 937.485580 | 15.588433 | 915.36 | 921.4800 | 935.890 | 947.2200 | 967.72 | 935.890 |
| Plant C5PlusYield | 7882.0 | 84.979594 | 1.235512 | 79.48 | 84.0700 | 85.240 | 85.8400 | 88.79 | 85.240 | 5962.0 | 84.994794 | 1.209658 | 79.48 | 84.1300 | 85.260 | 85.8300 | 88.79 | 85.260 |
def _show_correlation_heatmap(corr_matrix, title):
    """Draw one annotated correlation heatmap on a new 20x16 figure and show it."""
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title(title)
    plt.show()


# Pairwise correlation matrices of each dataset.
original_corr = original_data.corr()
synthetic_corr = synthetic_data.corr()

# One heatmap per dataset, original first.
_show_correlation_heatmap(original_corr, 'Correlation Matrix - Original Data')
_show_correlation_heatmap(synthetic_corr, 'Correlation Matrix - Synthetic Data')
# Pearson correlation of every column with "Plant C5PlusYield", computed
# separately for the original and the synthetic data.
original_target = original_data["Plant C5PlusYield"]
synthetic_target = synthetic_data["Plant C5PlusYield"]

# Rows are collected in a list and the dataframe is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
correlation_rows = []
for column in original_data.columns:
    # The target's correlation with itself carries no information.
    if column == "Plant C5PlusYield":
        continue
    # Scalars get their own names so the original_corr / synthetic_corr
    # correlation matrices are not overwritten.
    original_r, _ = pearsonr(original_data[column], original_target)
    synthetic_r, _ = pearsonr(synthetic_data[column], synthetic_target)
    correlation_rows.append({
        "Column-Pair": column,
        "Original Correlation": original_r,
        "Synthetic Correlation": synthetic_r,
    })

correlation_df = pd.DataFrame(
    correlation_rows,
    columns=["Column-Pair", "Original Correlation", "Synthetic Correlation"],
)
# Bare name so a notebook cell displays the table.
correlation_df
| | Column-Pair | Original Correlation | Synthetic Correlation |
|---|---|---|---|
| 0 | Feed N Plus 2A content | 0.382200 | 0.390723 |
| 1 | Reactor WAIT | -0.705501 | -0.707660 |
| 2 | H2 to HC | 0.072479 | 0.090593 |
| 3 | Reactor 1 Inlet Temp | -0.708477 | -0.718298 |
| 4 | Reactor 2 Inlet Temp | -0.673569 | -0.673391 |
| 5 | Reactor 3 Inlet Temp | -0.698320 | -0.697402 |
| 6 | Reactor 4 Inlet Temp | -0.658659 | -0.664151 |
| 7 | Reactor 1 Delta T | -0.340005 | -0.353268 |
| 8 | Reactor 2 Delta T | -0.401247 | -0.407639 |
| 9 | Reactor 3 Delta T | 0.393166 | 0.408004 |
| 10 | Reactor 4 Delta T | 0.602311 | 0.616397 |
| 11 | Reactor 1 Delta P | -0.441016 | -0.446243 |
| 12 | Reactor 2 Delta P | -0.529404 | -0.539138 |
| 13 | Reactor 3 Delta P | -0.152238 | -0.163390 |
| 14 | Reactor 4 Delta P | -0.243650 | -0.243682 |
| 15 | Seperator Pressure | 0.458794 | 0.480809 |
| 16 | Seperator Temperature | 0.335676 | 0.335589 |
| 17 | Recycle gas purity | 0.078186 | 0.077599 |
| 18 | Net gas Hydrogen Purity | 0.465102 | 0.468186 |
| 19 | Coke on Spent Catalyst | -0.251571 | -0.241133 |
| 20 | Chloride Injection rate | -0.009619 | -0.034867 |
| 21 | Total Paraffins in feed | 0.002509 | -0.333527 |
| 22 | Total Naphthenes in feed | 0.062026 | 0.075786 |
| 23 | Total Aromatics in feed | 0.287626 | 0.277741 |
| 24 | Total olefins in Feed | 0.392807 | 0.399858 |
| 25 | Reactor LHSV | 0.198023 | 0.193197 |
| 26 | Feed IBP | 0.360778 | 0.357431 |
| 27 | 50% IBP | 0.329538 | 0.362343 |
| 28 | WABT | -0.714266 | -0.716756 |
# Index the table by feature name so the bar chart uses it for the x ticks.
correlation_df.set_index('Column-Pair', inplace=True)

# Grouped bars: one pair (original, synthetic) per feature.
ax = correlation_df.plot(kind='bar', figsize=(18, 9))
ax.set_title('Correlation Comparison')
ax.set_xlabel('Column-Pair')
ax.set_ylabel('Pearson Correlation')
plt.xticks(rotation=45)
ax.legend(['Original Data', 'Synthetic Data'])
plt.show()